{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "# [PUBLIC] Analyse kernel profiling"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Platforms"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Odroid-XU3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "- [CPU] quad-core ARM Cortex-A15 @ 2000 MHz (\"big\");\n",
    "- [CPU] quad-core ARM Cortex-A7 @ 1400 MHz (\"LITTLE\");\n",
    "- [GPU] quad-core ARM Mali-T628 @ 600 MHz;\n",
    "- [GPU] dual-core ARM Mali-T628 @ 600 MHz (not used);\n",
    "- [GPU] OpenCL driver 12.0 (\"r12p0\");\n",
    "- [GPU] OpenCL standard 1.2;\n",
    "- [RAM] 2 GB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "platform_id = 'odroid-xu3'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Firefly-RK3399"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "- [CPU] dual-core ARM Cortex-A72 @ 1800 MHz (\"big\");\n",
    "- [CPU] quad-core ARM Cortex-A53 @ 1416 MHz (\"LITTLE\");\n",
    "- [GPU] quad-core ARM Mali-T860 @ 800 MHz;\n",
    "- [GPU] OpenCL driver 13.0 (\"r13p0-00rel0-git(a4271c9)\");\n",
    "- [GPU] OpenCL standard 1.2;\n",
    "- [RAM] 4 GB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# platform_id = 'firefly-rk3399'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "<a id=\"data\"></a>\n",
    "## Get the experimental data from DropBox"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "**NB:** Please ignore this section if you are not interested in re-running or modifying this notebook. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "The experimental data was collected on the experimental platform and archived as follows:\n",
    "```\n",
    "$ cd `ck find ck-caffe:script:dvdt-prof`\n",
    "$ python explore-dvdt-prof-libs-models-benchmarking.py\n",
    "$ ck zip local:experiment:dvdt-prof-* --archive_name=ck-caffe-dvdt-prof-<platform_id>.zip\n",
    "```\n",
    "The data can be downloaded and extracted as follows:\n",
    "\n",
    "```\n",
    "$ wget http://dl.dropboxusercontent.com/u/<...>/ck-caffe/public/ck-caffe-dvdt-prof-<platform_id>.zip\n",
    "$ ck add repo:ck-caffe-dvdt-prof-<platform_id> --zip=ck-caffe-dvdt-prof-<platform_id>.zip --quiet\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Includes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Standard"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "import json\n",
    "import time\n",
    "import math\n",
    "import operator"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Scientific"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import IPython as ip\n",
    "import numpy as np\n",
    "import scipy as sp\n",
    "import pandas as pd\n",
    "import matplotlib as mp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "print('IPython version: %s' % ip.__version__)\n",
    "print('NumPy version: %s' % np.__version__)\n",
    "print('SciPy version: %s' % sp.__version__)\n",
    "print('Pandas version: %s' % pd.__version__)\n",
    "print('Matplotlib version: %s' % mp.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "plt.style.use('classic') # https://matplotlib.org/users/dflt_style_changes.html\n",
    "plt.rcParams.update({'font.size': 20})\n",
    "\n",
    "from matplotlib import cm\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from IPython.display import display"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Collective Knowledge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import ck.kernel as ck\n",
    "print('CK version: %s' % ck.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# NB: Install dvdt-prof first e.g. as \"ck install ck-caffe/package/tool-dvdt-prof-cjson\".\n",
    "r=ck.access({'action':'show', 'module_uoa':'env', 'tags':'tool,opencl,dvdt,prof'})\n",
    "if r['return']>0:\n",
    "    print (\"Error: %s\" % r['error'])\n",
    "    exit(1)\n",
    "# Get the path to the first returned environment entry.\n",
    "dvdt_prof_dir=r['lst'][0]['meta']['env']['CK_ENV_TOOL_DVDT_PROF']\n",
    "dvdt_prof_src_python=os.path.join(dvdt_prof_dir,'src','python')\n",
    "sys.path.append(dvdt_prof_src_python)\n",
    "import prof_wrangler as pw\n",
    "import prof_common as pc\n",
    "pw.test()\n",
    "pc.test()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Access experimental results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "def get_experimental_results(repo_uoa, tags='dvdt-prof'):\n",
    "    module_uoa = 'experiment'\n",
    "    r=ck.access({'action':'search', 'repo_uoa':repo_uoa, 'module_uoa':module_uoa, 'tags':tags})\n",
    "    if r['return']>0:\n",
    "        print (\"Error: %s\" % r['error'])\n",
    "        exit(1)\n",
    "\n",
    "    results = pd.DataFrame()\n",
    "    dfs = []\n",
    "    experiments=r['lst']\n",
    "    \n",
    "    for experiment in experiments:\n",
    "        repo_uoa = experiment['repo_uoa']\n",
    "        data_uoa = experiment['data_uoa']\n",
    "        r = ck.access({'action':'list_points', 'repo_uoa':repo_uoa, 'module_uoa':module_uoa, 'data_uoa':data_uoa})\n",
    "        if r['return']>0:\n",
    "            print (\"Error: %s\" % r['error'])\n",
    "            exit(1)\n",
    "        \n",
    "        # Get (lib_tag, model_tag) from a list of tags that should be available in r['dict']['tags'].\n",
    "        # Tags include 2 of the 3 irrelevant tags, a model tag and a lib tag.\n",
    "        # NB: Since it's easier to list all model tags than all lib tags, the latter list is not expicitly specified.\n",
    "        tags = r['dict']['tags']\n",
    "        irrelevant_tags = [ 'dvdt-prof', 'caffe-time-opencl' ]\n",
    "        model_tags = [ 'bvlc-alexnet', 'bvlc-googlenet', 'deepscale-squeezenet-1.0', 'deepscale-squeezenet-1.1' ]\n",
    "        lib_model_tags = [ tag for tag in tags if tag not in irrelevant_tags ]\n",
    "        model_tags = [ tag for tag in lib_model_tags if tag in model_tags ]\n",
    "        lib_tags = [ tag for tag in lib_model_tags if tag not in model_tags ]\n",
    "        if len(lib_tags)==1 and len(model_tags)==1:\n",
    "             (lib, model) = (lib_tags[0], model_tags[0])\n",
    "        else:\n",
    "            continue\n",
    "\n",
    "        for point in r['points']:\n",
    "            with open(os.path.join(r['path'], 'ckp-%s.0001.json' % point)) as point_file:\n",
    "                point_data_raw = json.load(point_file)\n",
    "            characteristics_list = point_data_raw['characteristics_list']\n",
    "            num_repetitions = len(characteristics_list)\n",
    "            # DataFrame columns.\n",
    "            data = [\n",
    "                {\n",
    "                    # features\n",
    "                    'platform' : point_data_raw['features']['platform']['platform']['model'],\n",
    "                    # choices\n",
    "                    'lib' : lib,\n",
    "                    'model' : model,\n",
    "                    'batch_size' : np.int64(point_data_raw['choices']['env'].get('CK_CAFFE_BATCH_SIZE',[])),\n",
    "                    # statistical repetitions\n",
    "                    'repetition_id' : repetition_id,\n",
    "                    # runtime characteristics\n",
    "                    'time (ms)'     : np.float32(characteristics['run'].get('time_fw_ms',0)),\n",
    "                    'dvdt_prof_info': characteristics['run'].get('dvdt_prof',[]),\n",
    "                    'per_layer_info': characteristics['run'].get('per_layer_info',[]),\n",
    "                }\n",
    "                for (repetition_id, characteristics) in zip(range(num_repetitions), characteristics_list)  \n",
    "                #if characteristics['run'].get('run_success','')!=''\n",
    "            ]\n",
    "            df = pd.DataFrame(data)\n",
    "            df.columns.name = 'characteristics'\n",
    "            df.index.name = 'index'\n",
    "            df = df.set_index([ 'platform', 'lib', 'model', 'batch_size', 'repetition_id' ])\n",
    "            dfs.append(df)\n",
    "        results = pd.concat(dfs).sortlevel()\n",
    "    return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df = get_experimental_results(repo_uoa='ck-caffe-dvdt-prof-'+platform_id, tags='dvdt-prof')\n",
    "pd.options.display.max_columns = len(df.columns)\n",
    "pd.options.display.max_rows = len(df.index)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Check execution time distribution"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "**NB:** The total execution time includes the profiling overhead, so should not be interpreted as the indicative performance of a platform. The kernel execution time and the derived GFLOPS should be accurate."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "pd.options.display.max_columns = len(df.columns)\n",
    "pd.options.display.max_rows = len(df.index)*8\n",
    "df.groupby(level=df.index.names[:-1])[['time (ms)']].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Plot execution time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "def plot(mean, std, rot=0):\n",
    "    mean \\\n",
    "        .plot(yerr=std, title='Execution time (ms)', kind='bar', colormap=cm.autumn,\n",
    "            figsize=[16, 8], rot=rot, grid=True, legend=True) \\\n",
    "        .legend(loc='upper left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "df_mean = df.groupby(level=df.index.names[:-1])['time (ms)'].mean().unstack('lib')\n",
    "df_std = df.groupby(level=df.index.names[:-1])['time (ms)'].std().unstack('lib')\n",
    "plot(df_mean, df_std, rot=45)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Show profiling info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "df_min = df \\\n",
    "    .ix[df.groupby(level=df.index.names[:-1])['time (ms)'].idxmin()] \\\n",
    "    .reset_index('repetition_id', drop=True)\n",
    "df_min"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "batch_size = 1\n",
    "df_model_lib = df_min[['dvdt_prof_info']] \\\n",
    "    .reset_index('platform', drop=True) \\\n",
    "    .reorder_levels([ 'batch_size', 'model', 'lib']) \\\n",
    "    .loc[batch_size] \\\n",
    "    .sortlevel()\n",
    "df_model_lib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "unit='ms'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Analyse models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "models = df_model_lib.index.levels[0]\n",
    "libs = df_model_lib.index.levels[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "def concat(model, lib):\n",
    "    return '%s:%s' % (model, lib)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "def analyse_model_lib(df_model_lib, model, lib, min_pc=1.0):\n",
    "    trace = pw.index_calls(df_model_lib.loc[model].loc[lib]['dvdt_prof_info'])\n",
    "    # All kernel enqueues.\n",
    "    df_kernel_enqueues = pw.df_kernel_enqueues(pw.filter_calls(trace, ['clEnqueueNDRangeKernel']), unit='ms')\n",
    "    # Kernel enqueues that take at least 'min_pc' % of the execution time.\n",
    "    df_kernel_enqueues_cum_time_num = pw.df_kernel_enqueues_cumulative_time_num(df_kernel_enqueues, unit)\n",
    "    df_kernel_enqueues_cum_time_num.columns.name = concat(model, lib)\n",
    "    return df_kernel_enqueues_cum_time_num[df_kernel_enqueues_cum_time_num['** Execution time (%) **'] > min_pc]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "model_lib_analysis = {}\n",
    "for model in models:\n",
    "    for lib in libs:\n",
    "        title = concat(model, lib)\n",
    "        print('== %s ==' % title)\n",
    "        try:\n",
    "            analysis = analyse_model_lib(df_model_lib, model, lib, min_pc=0.0)\n",
    "        except:\n",
    "            print('... missing ...'); print(''); continue\n",
    "        model_lib_analysis[title] = analysis\n",
    "        pd.options.display.max_columns = analysis.columns.size\n",
    "        pd.options.display.max_rows = analysis.index.size\n",
    "        display(analysis)\n",
    "        print('')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Compare no-tune/tune"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "pd.DataFrame \\\n",
    "    .join(\n",
    "        model_lib_analysis['deepscale-squeezenet-1.1:opencl-clblast'][['** Execution time (ms) **']],\n",
    "        model_lib_analysis['deepscale-squeezenet-1.1:opencl-clblast-tune'][['** Execution time (ms) **']],\n",
    "        lsuffix=' deepscale-squeezenet-1.1:opencl-clblast **',\n",
    "        rsuffix=' deepscale-squeezenet-1.1:opencl-clblast-tune **',\n",
    "        how='outer'\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "model_lib_analysis['deepscale-squeezenet-1.1:opencl-clblast'][['** Execution time (ms) **']].sum()        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "model_lib_analysis['deepscale-squeezenet-1.1:opencl-clblast-tune'][['** Execution time (ms) **']].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "plot(\n",
    "    mean=pd.DataFrame.join(\n",
    "        model_lib_analysis['deepscale-squeezenet-1.1:opencl-clblast'][['** Execution time (ms) **']],\n",
    "        model_lib_analysis['deepscale-squeezenet-1.1:opencl-clblast-tune'][['** Execution time (ms) **']],\n",
    "        lsuffix=' deepscale-squeezenet-1.1:opencl-clblast **',\n",
    "        rsuffix=' deepscale-squeezenet-1.1:opencl-clblast-tune **',\n",
    "        how='outer'\n",
    "    ), std=pd.DataFrame(), rot=90)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Analyse xGEMM kernels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "def init_buckets(left=0, right=15):\n",
    "    powers_of_two = [ pow(2, i) for i in range(left, right) ]\n",
    "    buckets = [\n",
    "        (mm, nn, kk)\n",
    "        for mm in powers_of_two\n",
    "        for nn in powers_of_two\n",
    "        for kk in powers_of_two\n",
    "    ]\n",
    "    return buckets\n",
    "\n",
    "def distance((x1, y1, z1), (x2, y2, z2)):\n",
    "    dx = x2-x1\n",
    "    dy = y2-y1\n",
    "    dz = z2-z1\n",
    "    s = np.float64(dx**2 + dy**2 + dz**2)\n",
    "    return math.sqrt(s)\n",
    "\n",
    "# Returns the bucket nearest to the triple according to the metric computed by distance().\n",
    "# buckets is a list of power-of-two triples generated by init_buckets().\n",
    "def get_nearest_bucket(buckets, triple):\n",
    "    bucket = (-1, -1, -1)\n",
    "    min_distance = np.float('inf')\n",
    "    for cur_bucket in buckets:\n",
    "        cur_distance = distance(triple, cur_bucket)\n",
    "        if cur_distance < min_distance:\n",
    "            min_distance = cur_distance\n",
    "            bucket = cur_bucket\n",
    "    return bucket"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "def analyse_xgemm_kernel(df_model_lib, model, lib, kernel):\n",
    "    # Get trace for lib and model.\n",
    "    trace = pw.index_calls(df_model_lib.loc[model].loc[lib]['dvdt_prof_info'])\n",
    "    # All calls to set kernel args.\n",
    "    set_args = pw.filter_calls(trace, ['clSetKernelArg']) \n",
    "    # All kernel enqueues.\n",
    "    nqs = pw.filter_calls(trace, ['clEnqueueNDRangeKernel'])\n",
    "    # Construct a DataFrame with info about kernel enqueues.\n",
    "    df = pw.df_kernel_enqueues(nqs, unit='ms').swaplevel().ix[kernel]\n",
    "    df = df[['p3 - p2 (ms)', 'gws2']]\n",
    "    # As gws2 is always 1, we can use it to count the number of enqueues.\n",
    "    df.columns = [ '** Execution time (ms) **', '** Number of enqueues **' ]\n",
    "    df.columns.name = kernel\n",
    "    # Augment the DataFrame with columns for the (M, N, K) triples.\n",
    "    df['kSizeM'] = 'M'; df['bSizeM'] = 'MM'\n",
    "    df['kSizeN'] = 'N'; df['bSizeN'] = 'NN'\n",
    "    df['kSizeK'] = 'K'; df['bSizeK'] = 'KK'\n",
    "    # Initialise buckets.\n",
    "    buckets = init_buckets()\n",
    "    # Augment the DataFrame with the actual (M, N, K) triples.\n",
    "    mnk_triples = []; mmnnkk_triples = []\n",
    "    for nq in nqs:\n",
    "        if nq['name'] == kernel:\n",
    "            prof = nq['profiling']\n",
    "            (M, N, K) = ('M', 'N', 'K'); (MM, NN, KK) = ('MM', 'NN', 'KK')\n",
    "            for set_arg in set_args:\n",
    "                if (set_arg['call_index'] > nq['call_index']): break\n",
    "                if (set_arg['kernel'] != nq['kernel']): continue\n",
    "                arg_value = pc.hex_str_as_int(set_arg['arg_value'])\n",
    "                if (set_arg['arg_index'] == 0): M = arg_value; MM = arg_value\n",
    "                if (set_arg['arg_index'] == 1): N = arg_value; NN = arg_value\n",
    "                if (set_arg['arg_index'] == 2): K = arg_value; KK = arg_value\n",
    "            mnk_triples.append((M, N, K))\n",
    "            mmnnkk_triples.append(get_nearest_bucket(buckets, (M, N, K)))\n",
    "    df[['kSizeM', 'kSizeN', 'kSizeK']] = mnk_triples\n",
    "    df[['bSizeM', 'bSizeN', 'bSizeK']] = mmnnkk_triples\n",
    "    # Calculate Gflops and GFLOPS (Gflops/s).\n",
    "    df['** Gflops **'] = 2*df['kSizeM']*df['kSizeN']*df['kSizeK']*1e-9\n",
    "    df['** GFLOPS **'] = df['** Gflops **'] / (df['** Execution time (ms) **']*1e-3)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "model_lib_kernel_analysis = {}\n",
    "for model in models:\n",
    "    for lib in libs:\n",
    "        title = concat(model, lib)\n",
    "        print('== %s ==' % title)\n",
    "        try:\n",
    "            analysis = model_lib_analysis[title]\n",
    "        except:\n",
    "            print(' ... missing ...'); print(''); continue\n",
    "        for kernel in analysis.index:\n",
    "            if kernel.lower().find('xgemm') == -1: continue\n",
    "            analysis_xgemm = analyse_xgemm_kernel(df_model_lib, model, lib, kernel)\n",
    "            pd.options.display.max_columns = analysis_xgemm.columns.size\n",
    "            pd.options.display.max_rows = analysis_xgemm.index.size\n",
    "            display(analysis_xgemm)\n",
    "            analysis_xgemm_stats = analysis_xgemm.describe()\n",
    "            pd.options.display.max_columns = analysis_xgemm_stats.columns.size\n",
    "            pd.options.display.max_rows = analysis_xgemm_stats.index.size\n",
    "            display(analysis_xgemm_stats)\n",
    "            model_lib_kernel_analysis[concat(title, kernel)] = analysis_xgemm\n",
    "            print('')\n",
    "        print('')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Analyse xGEMM buckets for tuning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "min_pc = 0\n",
    "model_lib_bucket_analysis = {}\n",
    "for xgemm_analysis_key in model_lib_kernel_analysis:\n",
    "    print('== %s ==' % xgemm_analysis_key)\n",
    "    # Move the actual and bucket triples into the index to preserve them during the aggregation that follows.\n",
    "    xgemm_analysis = model_lib_kernel_analysis[xgemm_analysis_key] \\\n",
    "        .set_index(['kSizeM', 'kSizeN', 'kSizeK', 'bSizeM', 'bSizeN', 'bSizeK'])\n",
    "    # Aggregate the execution time and the number of enqueues.\n",
    "    xgemm_analysis = xgemm_analysis \\\n",
    "        .groupby(level=xgemm_analysis.index.names).sum()\n",
    "    # Move the actual triples back to the columns.\n",
    "    xgemm_analysis = xgemm_analysis \\\n",
    "        .reset_index(level=['kSizeM', 'kSizeN', 'kSizeK'])\n",
    "    xgemm_analysis.name = xgemm_analysis_key\n",
    "    # Calculate the execution time in percent.\n",
    "    xgemm_analysis['** Execution time (%) **'] = 100 * ( \\\n",
    "         xgemm_analysis['** Execution time (%s) **' % unit] / \\\n",
    "         xgemm_analysis['** Execution time (%s) **' % unit].sum())\n",
    "    # Calculate GFLOPS taking into account that the execution time is accumulated over several enqueues.\n",
    "    xgemm_analysis['** Gflops **'] = xgemm_analysis['** Number of enqueues **'] * \\\n",
    "        (2*xgemm_analysis['kSizeM']*xgemm_analysis['kSizeN']*xgemm_analysis['kSizeK']*1e-9)\n",
    "    xgemm_analysis['** Gflops (%) **'] = 100 * (xgemm_analysis['** Gflops **'] / xgemm_analysis['** Gflops **'].sum())\n",
    "    xgemm_analysis['** GFLOPS **'] = \\\n",
    "        xgemm_analysis['** Gflops **'] / (xgemm_analysis['** Execution time (ms) **']*1e-3)\n",
    "    # Sort by the kernel operations in the descending order.\n",
    "    pd.options.display.max_columns = len(xgemm_analysis.columns)\n",
    "    pd.options.display.max_rows = len(xgemm_analysis.index)\n",
    "    display(\n",
    "        xgemm_analysis[xgemm_analysis['** Gflops (%) **'] > min_pc] \\\n",
    "        .sort_values(by=['** Gflops (%) **'], ascending=False)\n",
    "    )\n",
    "    model_lib_bucket_analysis[xgemm_analysis_key] = xgemm_analysis\n",
    "    print('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "bucket_analysis_dir = os.path.join(os.path.curdir, 'bucket-analysis-%s-tmp' % platform_id)\n",
    "if not os.path.exists(bucket_analysis_dir):\n",
    "    os.makedirs(bucket_analysis_dir)\n",
    "\n",
    "for bucket_analysis_key in model_lib_bucket_analysis:\n",
    "    print('== %s ==' % bucket_analysis_key)\n",
    "    if bucket_analysis_key.find('tune') == -1:\n",
    "        print('... skipping no-tune ...'); print(''); continue\n",
    "    bucket_analysis = model_lib_bucket_analysis[bucket_analysis_key]\n",
    "    # Move the actual triples into the index to preserve them during the aggregation that follows.\n",
    "    bucket_analysis = model_lib_bucket_analysis[bucket_analysis_key] \\\n",
    "        .set_index(['kSizeM', 'kSizeN', 'kSizeK', '** GFLOPS **'], append=True)\n",
    "    # Aggregate by the bucket size.\n",
    "    bucket_analysis = bucket_analysis \\\n",
    "        .groupby(level=['bSizeM', 'bSizeN', 'bSizeK']).sum()\n",
    "    # Sort by the kernel operations in the descending order.\n",
    "    bucket_analysis = \\\n",
    "        bucket_analysis[bucket_analysis['** Gflops (%) **'] > min_pc] \\\n",
    "        .sort_values(by=['** Gflops (%) **'], ascending=False)\n",
    "    # Display.\n",
    "    pd.options.display.max_columns = len(bucket_analysis.columns)\n",
    "    pd.options.display.max_rows = len(bucket_analysis.index)\n",
    "    display(bucket_analysis)\n",
    "    # Dump buckets analysis to JSON.\n",
    "    bucket_stats = []\n",
    "    for index, row in bucket_analysis.iterrows():\n",
    "        stats = {}\n",
    "        stats['bSizeM'] = index[0]\n",
    "        stats['bSizeN'] = index[1]\n",
    "        stats['bSizeK'] = index[2]\n",
    "        stats['Number of enqueues']  = row['** Number of enqueues **']\n",
    "        stats['Execution time (ms)'] = row['** Execution time (ms) **']\n",
    "        stats['Execution time (%)']  = row['** Execution time (%) **']\n",
    "        stats['Gflops (%)'] = row['** Gflops (%) **']\n",
    "        stats['Gflops'] = row['** Gflops **']\n",
    "        bucket_stats.append(stats)\n",
    "    bucket_analysis_file = os.path.join(bucket_analysis_dir, '%s.json' % bucket_analysis_key)\n",
    "    with open(bucket_analysis_file, 'w') as f:\n",
    "        json.dump(bucket_stats, f, indent=2)\n",
    "    print('')"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
